
#################################
# Selection of relevant variables
#################################


# Project and author information
################################
# Project:    prlx
# Programme:  prlx-v01-management01.r
# R version:  2.10.1
# Date:       13 November 2010
# Author:     Frank Haege, University of Limerick
# Contact:    frank.haege@ul.ie

# Script description
####################
# This script takes the complete EUPOL dataset as input data and creates a 
# smaller dataset that includes only those variables that are relevant for 
# the data analysis of this project. 


# Start from an empty workspace by removing all non-hidden objects
# NOTE(review): this also wipes whatever the caller had in the workspace when
# the script is sourced into an existing session.
rm(list = ls())

# Switch to the project's data analysis folder; the relative file names used
# for reading and writing below are resolved against it
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
setwd("E:/Seagate Sync/VOL/My Documents/Research/Current projects/prlx/Posted/101113 EUP publication/Data analysis")

# Read the EUPOL dataset (expect this to take 2-3 minutes). The strings
# "None", "NONE" and "NA" are all read as missing values.
prlx <- read.csv(
  file = "prlx-v01-data.csv",
  na.strings = c("None", "NONE", "NA")
)

# Report the number of observations and variables that were read
dim(prlx)

# Sort variable columns alphabetically
prlx <- prlx[, sort(names(prlx)), drop = FALSE]

# Delete one observation with missing values (it does not even have a website
# number: its `no` field is the literal string "[]").
# The column is looked up by its exact name (`$` would partially match) and
# the row mask is built with `%in%`, which never returns NA. With `!=`, a
# missing `no` would put an NA into the mask and the row would be replaced by
# a phantom row consisting only of NAs.
stopifnot("no" %in% names(prlx))
is_empty_record <- prlx[["no"]] %in% "[]"
prlx <- prlx[!is_empty_record, , drop = FALSE]
dim(prlx)

# Move the proposal page number variable to the front of the dataset
prlx <- prlx[, c("no", setdiff(names(prlx), "no")), drop = FALSE]

# Rename the proposal page number variable
names(prlx)[1] <- "webno"

# Identify the number of variables and observations
dim(prlx)
# EUPOL includes 2600 variables (plus the row.names variable created by R)
# and 29366 observations


# Delete variables that refer to irrelevant event characteristics
#################################################################

# Delete all variables that refer to documents
# Columns are removed with a logical keep-mask rather than negative positions:
# if no name matched, `prlx[, -integer(0)]` would select zero columns and
# silently empty the data frame, whereas an all-TRUE mask leaves it unchanged.
is_document <- grepl("_documents_", names(prlx))
names(prlx)[is_document]          # names of the columns about to be dropped
documents <- which(is_document)   # their positions
length(documents)
prlx <- prlx[, !is_document, drop = FALSE]
dim(prlx)

# Delete all variables that refer to CELEX numbers
# Same keep-mask approach as above, for the same reason.
is_celex <- grepl("_celeNumb_", names(prlx))
names(prlx)[is_celex]             # names of the columns about to be dropped
celex <- which(is_celex)          # their positions
length(celex)
prlx <- prlx[, !is_celex, drop = FALSE]
dim(prlx)


# Select variables that refer to relevant events
################################################

# Select variable names of relevant events

# Helper: names of all columns of `prlx` that belong to one event, i.e. whose
# name starts with "<event>_". Returns a character vector (possibly empty).
event_columns <- function(event) {
  grep(sprintf("^%s_", event), names(prlx), value = TRUE)
}

# Each event is looked up exactly once. The script previously assigned
# formAdopByCoun, epDeciOn3rdRdg and epOpin2ndRdg twice.
replacement <- event_columns("replacement")
partAdopByCoun <- event_columns("partAdopByCoun")
fielOfActi <- event_columns("fielOfActi")
typeOfFile <- event_columns("typeOfFile")
procedures <- event_columns("procedures")
adopByComm <- event_columns("adopByComm")
tranToEp <- event_columns("tranToEp")
tranToCoun <- event_columns("tranToCoun")
epCtteRepo1stRdg <- event_columns("epCtteRepo1stRdg")
epOpin1stRdg <- event_columns("epOpin1stRdg")
epCtteRepoSingRdg <- event_columns("epCtteRepoSingRdg")
epOpinSingRdg <- event_columns("epOpinSingRdg")
commPosiOnEpAmenOn1stRead <- event_columns("commPosiOnEpAmenOn1stRead")
commPosiOnEpAmenOnSingRead <- event_columns("commPosiOnEpAmenOnSingRead")
adopAmenProp <- event_columns("adopAmenProp")
formAdopByCoun <- event_columns("formAdopByCoun")
polAgreCommPosi <- event_columns("polAgreCommPosi")
adopCommPosi <- event_columns("adopCommPosi")
counAppr1stRdg <- event_columns("counAppr1stRdg")
counAgre <- event_columns("counAgre")
chanOfLegaBasiByComm <- event_columns("chanOfLegaBasiByComm")
tranAmenPropToCoun <- event_columns("tranAmenPropToCoun")
tranAmenPropToEp <- event_columns("tranAmenPropToEp")
signByEpAndCoun <- event_columns("signByEpAndCoun")
withByComm <- event_columns("withByComm")
rejeByCoun <- event_columns("rejeByCoun")
noAdopByCoun <- event_columns("noAdopByCoun")
counAppr2ndRdg <- event_columns("counAppr2ndRdg")
partSignByEpAndCoun <- event_columns("partSignByEpAndCoun")
counDeciAt3rdRdg <- event_columns("counDeciAt3rdRdg")
epDeciOn3rdRdg <- event_columns("epDeciOn3rdRdg")
epOpin2ndRdg <- event_columns("epOpin2ndRdg")
concCommDeci <- event_columns("concCommDeci")

# Combine all selected variable names into one character vector
# Every name must occur only once: selecting a column twice from a data frame
# yields a duplicated column with a ".1" suffix. epDeciOn3rdRdg and
# epOpin2ndRdg were previously listed twice; unique() guards against any
# remaining repetition.
variables <- unique(c(
  "webno", "procCode", "propCode", "title", "runno",
  "legaBasi_original", fielOfActi, typeOfFile, procedures,
  adopByComm, tranToEp, tranToCoun, epCtteRepo1stRdg, epOpin1stRdg,
  epCtteRepoSingRdg, epOpinSingRdg, commPosiOnEpAmenOn1stRead,
  commPosiOnEpAmenOnSingRead, adopAmenProp, polAgreCommPosi,
  adopCommPosi, counAppr1stRdg, counAgre, formAdopByCoun,
  partAdopByCoun, replacement, chanOfLegaBasiByComm,
  tranAmenPropToEp, tranAmenPropToCoun, signByEpAndCoun,
  withByComm, rejeByCoun, noAdopByCoun, counAppr2ndRdg,
  partSignByEpAndCoun, counDeciAt3rdRdg, epDeciOn3rdRdg,
  epOpin2ndRdg, concCommDeci
))


# Generate a new data frame including only the selected variables
#################################################################

# How many variable names were selected?
length(variables)

# Size of the data frame before it is reduced
dim(prlx)

# Keep only the selected variables; all observations are retained
prlx <- prlx[, variables, drop = FALSE]

# Size of the data frame after the reduction
dim(prlx)

# Print summary statistics for every remaining variable
summary(prlx)

# Show the current column order, then put the columns in alphabetical order
names(prlx)
prlx <- prlx[, sort(names(prlx))]


# Write the reduced data frame to a comma-separated text file in the
# working directory
write.csv(prlx, file = "prlx-v01-management01.csv")

